3.1 Análisis Descriptivo

# librerías necesarias para implementar las funciones
library(readxl)
library(glue)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggmosaic)
library(ggridges)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ data.table::first()   masks dplyr::first()
## ✖ lubridate::hour()     masks data.table::hour()
## ✖ lubridate::isoweek()  masks data.table::isoweek()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ data.table::last()    masks dplyr::last()
## ✖ lubridate::mday()     masks data.table::mday()
## ✖ lubridate::minute()   masks data.table::minute()
## ✖ lubridate::month()    masks data.table::month()
## ✖ lubridate::quarter()  masks data.table::quarter()
## ✖ lubridate::second()   masks data.table::second()
## ✖ purrr::transpose()    masks data.table::transpose()
## ✖ lubridate::wday()     masks data.table::wday()
## ✖ lubridate::week()     masks data.table::week()
## ✖ lubridate::yday()     masks data.table::yday()
## ✖ lubridate::year()     masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
## 
## Attaching package: 'pastecs'
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## The following objects are masked from 'package:data.table':
## 
##     first, last
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(xtable)
library(here)
## here() starts at /Users/sofiabocker/Desktop/universidad/UCR/Actuariales/Cuarto año/I Ciclo/Estadística Actuarial I/Proyecto/cod
library(skimr) 
library(kableExtra) 
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Import the data set.
# The path is built relative to the project root reported by here() (the `cod`
# folder), so the script no longer depends on one user's absolute Desktop path.
# The workbook lives in the sibling folder "base de datos".
base_datos <- read_excel(here("..", "base de datos", "base_datos_alcohol.xlsx"))
## New names:
## • `` -> `...32`
## • `` -> `...33`
# Drop the two unnamed trailing columns (auto-named `...32` and `...33` on
# import) in a single step, then discard the last 25 rows of the sheet.
base_datos <- head(base_datos[, -c(32, 33)], n = -25)

# Collapse the five-category variables into three categories.
# Both helpers convert the result back to character (fct_collapse() returns a
# factor) so the collapsed columns keep the same type as the other
# categorical columns.

# Merge "Very Low" into "Low" and "Very High" into "High".
collapse_low_high <- function(x) {
  as.character(fct_collapse(
    x,
    Low = c("Low", "Very Low"),
    High = c("High", "Very High"),
    Moderate = "Moderate"
  ))
}

# Merge "Very Poor" into "Poor" and the levels listed in `good` into "Good".
collapse_poor_good <- function(x, good) {
  as.character(fct_collapse(
    x,
    Poor = c("Poor", "Very Poor"),
    Good = good,
    Fair = "Fair"
  ))
}

# clean_names() only needs to run once; all six variables are then collapsed
# in a single mutate() call.
base_datos_clean <- base_datos %>%
  clean_names() %>%
  mutate(
    across(
      c(
        alcohol_weekdays,
        alcohol_weekends,
        free_time_after_school,
        time_with_friends
      ),
      collapse_low_high
    ),
    health_status = collapse_poor_good(
      health_status,
      good = c("Very Good", "Good")
    ),
    good_family_relationship = collapse_poor_good(
      good_family_relationship,
      good = c("Excellent", "Good")
    )
  )

Data

 # Show the first six observations of the cleaned data set.
 head(base_datos_clean, n = 6L)
## # A tibble: 6 × 31
##   school  gender   age housing_type family_size parental_status mother_education
##   <chr>   <chr>  <dbl> <chr>        <chr>       <chr>           <chr>           
## 1 Gabrie… Female    18 Urban        Above 3     Separated       Higher Education
## 2 Gabrie… Female    17 Urban        Above 3     Living Together Primary School  
## 3 Gabrie… Female    15 Urban        Up to 3     Living Together Primary School  
## 4 Gabrie… Female    15 Urban        Above 3     Living Together Higher Education
## 5 Gabrie… Female    16 Urban        Above 3     Living Together High School     
## 6 Gabrie… Male      16 Urban        Up to 3     Living Together Higher Education
## # ℹ 24 more variables: father_education <chr>, mother_work <chr>,
## #   father_work <chr>, reason_school_choice <chr>, legal_responsibility <chr>,
## #   commute_time <chr>, weekly_study_time <chr>,
## #   extra_educational_support <chr>, parental_educational_support <chr>,
## #   private_tutoring <chr>, extracurricular_activities <chr>,
## #   attended_daycare <chr>, desire_graduate_education <chr>,
## #   has_internet <chr>, is_dating <chr>, good_family_relationship <chr>, …
# Show the structure of the data. str() prints its report and returns NULL
# invisibly, so the result is not assigned (assigning it only created a NULL
# variable named after the base function).
str(base_datos_clean)
## tibble [649 × 31] (S3: tbl_df/tbl/data.frame)
##  $ school                      : chr [1:649] "Gabriel Pereira" "Gabriel Pereira" "Gabriel Pereira" "Gabriel Pereira" ...
##  $ gender                      : chr [1:649] "Female" "Female" "Female" "Female" ...
##  $ age                         : num [1:649] 18 17 15 15 16 16 16 17 15 15 ...
##  $ housing_type                : chr [1:649] "Urban" "Urban" "Urban" "Urban" ...
##  $ family_size                 : chr [1:649] "Above 3" "Above 3" "Up to 3" "Above 3" ...
##  $ parental_status             : chr [1:649] "Separated" "Living Together" "Living Together" "Living Together" ...
##  $ mother_education            : chr [1:649] "Higher Education" "Primary School" "Primary School" "Higher Education" ...
##  $ father_education            : chr [1:649] "Higher Education" "Primary School" "Primary School" "Lower Secondary School" ...
##  $ mother_work                 : chr [1:649] "Homemaker" "Homemaker" "Homemaker" "Health" ...
##  $ father_work                 : chr [1:649] "Teacher" "other" "other" "Services" ...
##  $ reason_school_choice        : chr [1:649] "Course Preference" "Course Preference" "Other" "Near Home" ...
##  $ legal_responsibility        : chr [1:649] "Mother" "Father" "Mother" "Mother" ...
##  $ commute_time                : chr [1:649] "15 to 30 min" "Up to 15 min" "Up to 15 min" "Up to 15 min" ...
##  $ weekly_study_time           : chr [1:649] "2 to 5h" "2 to 5h" "2 to 5h" "5 to 10h" ...
##  $ extra_educational_support   : chr [1:649] "Yes" "No" "Yes" "No" ...
##  $ parental_educational_support: chr [1:649] "No" "Yes" "No" "Yes" ...
##  $ private_tutoring            : chr [1:649] "No" "No" "No" "No" ...
##  $ extracurricular_activities  : chr [1:649] "No" "No" "No" "Yes" ...
##  $ attended_daycare            : chr [1:649] "Yes" "No" "Yes" "Yes" ...
##  $ desire_graduate_education   : chr [1:649] "Yes" "Yes" "Yes" "Yes" ...
##  $ has_internet                : chr [1:649] "No" "Yes" "Yes" "Yes" ...
##  $ is_dating                   : chr [1:649] "No" "No" "No" "Yes" ...
##  $ good_family_relationship    : chr [1:649] "Good" "Good" "Good" "Fair" ...
##  $ free_time_after_school      : chr [1:649] "Moderate" "Moderate" "Moderate" "Low" ...
##  $ time_with_friends           : chr [1:649] "High" "Moderate" "Low" "Low" ...
##  $ alcohol_weekdays            : chr [1:649] "Low" "Low" "Low" "Low" ...
##  $ alcohol_weekends            : chr [1:649] "Low" "Low" "Moderate" "Low" ...
##  $ health_status               : chr [1:649] "Fair" "Fair" "Fair" "Good" ...
##  $ school_absence              : num [1:649] 4 2 6 0 0 6 0 2 0 0 ...
##  $ grade_1st_semester          : num [1:649] 0 9 12 14 11 12 13 10 15 12 ...
##  $ grade_2nd_semester          : num [1:649] 11 11 13 14 13 12 12 13 16 12 ...
# Dimensions of the data set: number of rows, then number of columns.
c(nrow(base_datos_clean), ncol(base_datos_clean))
## [1] 649  31
# General summary of every column in the data set.
summary(object = base_datos_clean)
##     school             gender               age        housing_type      
##  Length:649         Length:649         Min.   :15.00   Length:649        
##  Class :character   Class :character   1st Qu.:16.00   Class :character  
##  Mode  :character   Mode  :character   Median :17.00   Mode  :character  
##                                        Mean   :16.74                     
##                                        3rd Qu.:18.00                     
##                                        Max.   :22.00                     
##  family_size        parental_status    mother_education   father_education  
##  Length:649         Length:649         Length:649         Length:649        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  mother_work        father_work        reason_school_choice
##  Length:649         Length:649         Length:649          
##  Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character    
##                                                            
##                                                            
##                                                            
##  legal_responsibility commute_time       weekly_study_time 
##  Length:649           Length:649         Length:649        
##  Class :character     Class :character   Class :character  
##  Mode  :character     Mode  :character   Mode  :character  
##                                                            
##                                                            
##                                                            
##  extra_educational_support parental_educational_support private_tutoring  
##  Length:649                Length:649                   Length:649        
##  Class :character          Class :character             Class :character  
##  Mode  :character          Mode  :character             Mode  :character  
##                                                                           
##                                                                           
##                                                                           
##  extracurricular_activities attended_daycare   desire_graduate_education
##  Length:649                 Length:649         Length:649               
##  Class :character           Class :character   Class :character         
##  Mode  :character           Mode  :character   Mode  :character         
##                                                                         
##                                                                         
##                                                                         
##  has_internet        is_dating         good_family_relationship
##  Length:649         Length:649         Length:649              
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##  free_time_after_school time_with_friends  alcohol_weekdays  
##  Length:649             Length:649         Length:649        
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##                                                              
##                                                              
##                                                              
##  alcohol_weekends   health_status      school_absence   grade_1st_semester
##  Length:649         Length:649         Min.   : 0.000   Min.   : 0.0      
##  Class :character   Class :character   1st Qu.: 0.000   1st Qu.:10.0      
##  Mode  :character   Mode  :character   Median : 2.000   Median :11.0      
##                                        Mean   : 3.659   Mean   :11.4      
##                                        3rd Qu.: 6.000   3rd Qu.:13.0      
##                                        Max.   :32.000   Max.   :19.0      
##  grade_2nd_semester
##  Min.   : 0.00     
##  1st Qu.:10.00     
##  Median :11.00     
##  Mean   :11.57     
##  3rd Qu.:13.00     
##  Max.   :19.00
# Explore the data with skimr: one summary table per column type (missing
# counts and unique values for character columns; mean, sd, quantiles and an
# inline histogram for numeric columns).
# The data frame is passed directly by name; that name is what appears in the
# "Name" field of the printed summary.
skimr::skim(base_datos_clean) 
Data summary
Name base_datos_clean
Number of rows 649
Number of columns 31
_______________________
Column type frequency:
character 27
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
school 0 1 15 20 0 2 0
gender 0 1 4 6 0 2 0
housing_type 0 1 5 5 0 2 0
family_size 0 1 7 7 0 2 0
parental_status 0 1 9 15 0 2 0
mother_education 0 1 4 22 0 5 0
father_education 0 1 4 22 0 5 0
mother_work 0 1 5 9 0 5 0
father_work 0 1 5 9 0 5 0
reason_school_choice 0 1 5 17 0 4 0
legal_responsibility 0 1 5 6 0 3 0
commute_time 0 1 12 12 0 4 0
weekly_study_time 0 1 7 13 0 4 0
extra_educational_support 0 1 2 3 0 2 0
parental_educational_support 0 1 2 3 0 2 0
private_tutoring 0 1 2 3 0 2 0
extracurricular_activities 0 1 2 3 0 2 0
attended_daycare 0 1 2 3 0 2 0
desire_graduate_education 0 1 2 3 0 2 0
has_internet 0 1 2 3 0 2 0
is_dating 0 1 2 3 0 2 0
good_family_relationship 0 1 4 4 0 3 0
free_time_after_school 0 1 3 8 0 3 0
time_with_friends 0 1 3 8 0 3 0
alcohol_weekdays 0 1 3 8 0 3 0
alcohol_weekends 0 1 3 8 0 3 0
health_status 0 1 4 4 0 3 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 16.74 1.22 15 16 17 18 22 ▇▅▅▁▁
school_absence 0 1 3.66 4.64 0 0 2 6 32 ▇▂▁▁▁
grade_1st_semester 0 1 11.40 2.75 0 10 11 13 19 ▁▂▇▇▁
grade_2nd_semester 0 1 11.57 2.91 0 10 11 13 19 ▁▁▇▇▂
# Summary of the data set computed separately for each school: by() splits the
# rows on base_datos_clean$school and applies summary() to each subset.
# The grouping expression is printed as the header of each group.
by(base_datos_clean, base_datos_clean$school, summary)
## base_datos_clean$school: Gabriel Pereira
##     school             gender               age        housing_type      
##  Length:423         Length:423         Min.   :15.00   Length:423        
##  Class :character   Class :character   1st Qu.:16.00   Class :character  
##  Mode  :character   Mode  :character   Median :17.00   Mode  :character  
##                                        Mean   :16.67                     
##                                        3rd Qu.:18.00                     
##                                        Max.   :22.00                     
##  family_size        parental_status    mother_education   father_education  
##  Length:423         Length:423         Length:423         Length:423        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  mother_work        father_work        reason_school_choice
##  Length:423         Length:423         Length:423          
##  Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character    
##                                                            
##                                                            
##                                                            
##  legal_responsibility commute_time       weekly_study_time 
##  Length:423           Length:423         Length:423        
##  Class :character     Class :character   Class :character  
##  Mode  :character     Mode  :character   Mode  :character  
##                                                            
##                                                            
##                                                            
##  extra_educational_support parental_educational_support private_tutoring  
##  Length:423                Length:423                   Length:423        
##  Class :character          Class :character             Class :character  
##  Mode  :character          Mode  :character             Mode  :character  
##                                                                           
##                                                                           
##                                                                           
##  extracurricular_activities attended_daycare   desire_graduate_education
##  Length:423                 Length:423         Length:423               
##  Class :character           Class :character   Class :character         
##  Mode  :character           Mode  :character   Mode  :character         
##                                                                         
##                                                                         
##                                                                         
##  has_internet        is_dating         good_family_relationship
##  Length:423         Length:423         Length:423              
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##  free_time_after_school time_with_friends  alcohol_weekdays  
##  Length:423             Length:423         Length:423        
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##                                                              
##                                                              
##                                                              
##  alcohol_weekends   health_status      school_absence   grade_1st_semester
##  Length:423         Length:423         Min.   : 0.000   Min.   : 0.00     
##  Class :character   Class :character   1st Qu.: 0.000   1st Qu.:10.00     
##  Mode  :character   Mode  :character   Median : 2.000   Median :12.00     
##                                        Mean   : 4.215   Mean   :11.99     
##                                        3rd Qu.: 6.000   3rd Qu.:14.00     
##                                        Max.   :32.000   Max.   :18.00     
##  grade_2nd_semester
##  Min.   : 6.00     
##  1st Qu.:10.00     
##  Median :12.00     
##  Mean   :12.14     
##  3rd Qu.:14.00     
##  Max.   :19.00     
## ------------------------------------------------------------ 
## base_datos_clean$school: Mousinho da Silveira
##     school             gender               age        housing_type      
##  Length:226         Length:226         Min.   :15.00   Length:226        
##  Class :character   Class :character   1st Qu.:16.00   Class :character  
##  Mode  :character   Mode  :character   Median :17.00   Mode  :character  
##                                        Mean   :16.89                     
##                                        3rd Qu.:18.00                     
##                                        Max.   :20.00                     
##  family_size        parental_status    mother_education   father_education  
##  Length:226         Length:226         Length:226         Length:226        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  mother_work        father_work        reason_school_choice
##  Length:226         Length:226         Length:226          
##  Class :character   Class :character   Class :character    
##  Mode  :character   Mode  :character   Mode  :character    
##                                                            
##                                                            
##                                                            
##  legal_responsibility commute_time       weekly_study_time 
##  Length:226           Length:226         Length:226        
##  Class :character     Class :character   Class :character  
##  Mode  :character     Mode  :character   Mode  :character  
##                                                            
##                                                            
##                                                            
##  extra_educational_support parental_educational_support private_tutoring  
##  Length:226                Length:226                   Length:226        
##  Class :character          Class :character             Class :character  
##  Mode  :character          Mode  :character             Mode  :character  
##                                                                           
##                                                                           
##                                                                           
##  extracurricular_activities attended_daycare   desire_graduate_education
##  Length:226                 Length:226         Length:226               
##  Class :character           Class :character   Class :character         
##  Mode  :character           Mode  :character   Mode  :character         
##                                                                         
##                                                                         
##                                                                         
##  has_internet        is_dating         good_family_relationship
##  Length:226         Length:226         Length:226              
##  Class :character   Class :character   Class :character        
##  Mode  :character   Mode  :character   Mode  :character        
##                                                                
##                                                                
##                                                                
##  free_time_after_school time_with_friends  alcohol_weekdays  
##  Length:226             Length:226         Length:226        
##  Class :character       Class :character   Class :character  
##  Mode  :character       Mode  :character   Mode  :character  
##                                                              
##                                                              
##                                                              
##  alcohol_weekends   health_status      school_absence   grade_1st_semester
##  Length:226         Length:226         Min.   : 0.000   Min.   : 4.0      
##  Class :character   Class :character   1st Qu.: 0.000   1st Qu.: 8.0      
##  Mode  :character   Mode  :character   Median : 2.000   Median :10.0      
##                                        Mean   : 2.619   Mean   :10.3      
##                                        3rd Qu.: 4.000   3rd Qu.:12.0      
##                                        Max.   :12.000   Max.   :19.0      
##  grade_2nd_semester
##  Min.   : 0.00     
##  1st Qu.: 9.00     
##  Median :10.00     
##  Mean   :10.50     
##  3rd Qu.:12.75     
##  Max.   :18.00

Variables cuantitativas

# Build a data frame holding only the numeric columns.
# select(where(...)) replaces the superseded select_if().
base_datos_num <- base_datos_clean %>%
  select(where(is.numeric))
base_datos_num
## # A tibble: 649 × 4
##      age school_absence grade_1st_semester grade_2nd_semester
##    <dbl>          <dbl>              <dbl>              <dbl>
##  1    18              4                  0                 11
##  2    17              2                  9                 11
##  3    15              6                 12                 13
##  4    15              0                 14                 14
##  5    16              0                 11                 13
##  6    16              6                 12                 12
##  7    16              0                 13                 12
##  8    17              2                 10                 13
##  9    15              0                 15                 16
## 10    15              0                 12                 12
## # ℹ 639 more rows

Rango Intercuartil

# Interquartile range of each numeric column, kept as a named list.
rango_intercuantil <- base_datos_num %>%
  lapply(IQR)
print(rango_intercuantil)
## $age
## [1] 2
## 
## $school_absence
## [1] 6
## 
## $grade_1st_semester
## [1] 3
## 
## $grade_2nd_semester
## [1] 3

Desviación Estándar

# Standard deviation of each numeric column, kept as a named list.
desviacion_estandar <- base_datos_num %>%
  lapply(sd)
print(desviacion_estandar)
## $age
## [1] 1.218138
## 
## $school_absence
## [1] 4.640759
## 
## $grade_1st_semester
## [1] 2.745265
## 
## $grade_2nd_semester
## [1] 2.913639

Varianza

# Variance of each numeric column, kept as a named list.
varianza <- base_datos_num %>%
  lapply(var)
print(varianza)
## $age
## [1] 1.483859
## 
## $school_absence
## [1] 21.53664
## 
## $grade_1st_semester
## [1] 7.536481
## 
## $grade_2nd_semester
## [1] 8.48929

Estadísticas más Específicas

# More detailed descriptive statistics for the numeric columns
# (pastecs::stat.desc: counts, range, mean, SE, CI, variance, ...).
estadisticas <- base_datos_num %>%
  stat.desc()
print(estadisticas)
##                       age school_absence grade_1st_semester grade_2nd_semester
## nbr.val      6.490000e+02    649.0000000        649.0000000        649.0000000
## nbr.null     0.000000e+00    244.0000000          1.0000000          7.0000000
## nbr.na       0.000000e+00      0.0000000          0.0000000          0.0000000
## min          1.500000e+01      0.0000000          0.0000000          0.0000000
## max          2.200000e+01     32.0000000         19.0000000         19.0000000
## range        7.000000e+00     32.0000000         19.0000000         19.0000000
## sum          1.086700e+04   2375.0000000       7398.0000000       7509.0000000
## median       1.700000e+01      2.0000000         11.0000000         11.0000000
## mean         1.674422e+01      3.6594761         11.3990755         11.5701079
## SE.mean      4.781608e-02      0.1821657          0.1077611          0.1143703
## CI.mean.0.95 9.389318e-02      0.3577064          0.2116031          0.2245812
## var          1.483859e+00     21.5366423          7.5364806          8.4892903
## std.dev      1.218138e+00      4.6407588          2.7452651          2.9136387
## coef.var     7.274973e-02      1.2681484          0.2408323          0.2518247

Correlaciones y sus representaciones gráficas

# Correlation coefficient of every numeric column with the age column.
# Each column is passed to cor() as `x`; age is fixed as `y`.
corr_edad <- lapply(base_datos_num, cor, y = base_datos_num$age)
print(corr_edad)
## $age
## [1] 1
## 
## $school_absence
## [1] 0.1499982
## 
## $grade_1st_semester
## [1] -0.1743222
## 
## $grade_2nd_semester
## [1] -0.1071191
# Turn the list of correlations into a data frame (one row per column) so it
# can be plotted.
corr_edad <- data.frame(
  column_names = names(corr_edad),
  correlation = unlist(corr_edad)
)

# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  data = corr_edad,
  mapping = aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red", midpoint = 0
  ) +
  labs(title = "Correlación con Edad", x = "Columnas", y = "Correlación")

# Correlation coefficient of every numeric column with the absences column.
# Each column is passed to cor() as `x`; school_absence is fixed as `y`.
corr_ausencias <- lapply(
  base_datos_num, cor,
  y = base_datos_num$school_absence
)
print(corr_ausencias)
## $age
## [1] 0.1499982
## 
## $school_absence
## [1] 1
## 
## $grade_1st_semester
## [1] -0.1471492
## 
## $grade_2nd_semester
## [1] -0.1247449
# Turn the list of correlations into a data frame for plotting.
# The result is stored under the absences name (the same list-to-data-frame
# pattern used for age) instead of overwriting `corr_edad` with values that
# are not age correlations.
corr_ausencias <- data.frame(
  column_names = names(corr_ausencias),
  correlation = unlist(corr_ausencias)
)

# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  corr_ausencias,
  aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  labs(title = "Correlación con Ausencias", x = "Columnas", y = "Correlación") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0)

# Correlation coefficient of every numeric column with the first-semester
# grade. Each column is passed to cor() as `x`; the grade is fixed as `y`.
corr_notas_primer_sem <- lapply(
  base_datos_num, cor,
  y = base_datos_num$grade_1st_semester
)
print(corr_notas_primer_sem)
## $age
## [1] -0.1743222
## 
## $school_absence
## [1] -0.1471492
## 
## $grade_1st_semester
## [1] 1
## 
## $grade_2nd_semester
## [1] 0.8649816
# Turn the list of correlations into a data frame for plotting.
# The result is stored under the first-semester name (the same
# list-to-data-frame pattern used for age) instead of overwriting `corr_edad`
# with values that are not age correlations.
corr_notas_primer_sem <- data.frame(
  column_names = names(corr_notas_primer_sem),
  correlation = unlist(corr_notas_primer_sem)
)

# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  corr_notas_primer_sem,
  aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  labs(title = "Correlación con Notas Primer Semestre", x = "Columnas", y = "Correlación") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0)

# Correlation coefficient of every numeric column with the second-semester
# grade. Each column is passed to cor() as `x`; the grade is fixed as `y`.
corr_notas_segundo_sem <- lapply(
  base_datos_num, cor,
  y = base_datos_num$grade_2nd_semester
)
print(corr_notas_segundo_sem)
## $age
## [1] -0.1071191
## 
## $school_absence
## [1] -0.1247449
## 
## $grade_1st_semester
## [1] 0.8649816
## 
## $grade_2nd_semester
## [1] 1
# Turn the list of correlations into a data frame for plotting.
# The result is stored under the second-semester name (the same
# list-to-data-frame pattern used for age) instead of overwriting `corr_edad`
# with values that are not age correlations.
corr_notas_segundo_sem <- data.frame(
  column_names = names(corr_notas_segundo_sem),
  correlation = unlist(corr_notas_segundo_sem)
)

# Bar chart of the correlations; the fill diverges around zero
# (blue = negative, white = zero, red = positive).
ggplot(
  corr_notas_segundo_sem,
  aes(x = column_names, y = correlation, fill = correlation)
) +
  geom_col() +
  labs(title = "Correlación con Notas Segundo Semestre", x = "Columnas", y = "Correlación") +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0)

Histogramas

# Draw a histogram (bin width 1) for every quantitative column.
# The column is looked up by name through the `.data` pronoun, so no temporary
# one-column data frame is needed. The y-axis label typo ("Frequencia") is
# corrected to "Frecuencia".

lapply(names(base_datos_num), function(col_name) {
  ggplot(base_datos_num, aes(x = .data[[col_name]])) +
    geom_histogram(binwidth = 1, fill = "blue") +
    labs(title = col_name, x = col_name, y = "Frecuencia")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

Box Plots

# Draw a box plot for every quantitative column.
# aes_string() is deprecated; the column is selected by name with the `.data`
# pronoun instead, and the x-axis label is set explicitly to the column name.

lapply(names(base_datos_num), function(col_name) {
  ggplot(base_datos_num, aes(x = .data[[col_name]])) +
    geom_boxplot(
      outlier.colour = "black", outlier.shape = 16,
      outlier.size = 2, notch = FALSE
    ) +
    labs(x = col_name)
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

# Box plots of age grouped by the values of every other quantitative column.
# - The other columns are picked with a logical filter rather than
#   `-which(...)`: if "age" were ever absent, `-which()` would become
#   `-integer(0)` and silently select no columns at all.
# - boxplot() returns its summary statistics; invisible() keeps the list that
#   lapply() collects from being auto-printed into the report.
# - Explicit axis labels replace the deparsed `base_datos_num$...` expressions.
invisible(lapply(
  names(base_datos_num)[names(base_datos_num) != "age"],
  function(col_name) {
    boxplot(
      base_datos_num$age ~ base_datos_num[[col_name]],
      main = paste("Edad y", col_name),
      xlab = col_name,
      ylab = "age"
    )
  }
))

## [[1]]
## [[1]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]   15   15   15 15.0   15   16   15 15.0   15    18    16    15    16    18
## [2,]   16   16   16 16.5   16   17   16 15.5   16    18    17    15    16    18
## [3,]   17   16   17 18.0   17   17   16 16.0   17    18    17    15    17    18
## [4,]   17   17   17 18.0   18   18   17 16.0   18    18    18    16    19    18
## [5,]   18   18   18 18.0   19   19   18 16.0   20    18    19    16    22    18
##      [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,]  16.0  17.0    15  17.0    17    17    15    18    17    17
## [2,]  16.5  17.0    16  17.0    17    17    15    18    17    17
## [3,]  17.0  17.5    17  17.0    19    17    15    18    17    17
## [4,]  17.0  18.0    18  17.5    21    17    15    18    17    17
## [5,]  17.0  18.0    19  18.0    21    17    15    18    17    17
## 
## [[1]]$n
##  [1] 244  12 110   7  93  12  49   3  42   7  21   5  12   1   8   2  10   3   2
## [20]   2   1   1   1   1
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 16.89885 15.54389 16.84935 17.10422 16.67232 16.54389 15.77429 15.54389
## [2,] 17.10115 16.45611 17.15065 18.89578 17.32768 17.45611 16.22571 16.45611
##         [,9] [,10]    [,11]   [,12]    [,13] [,14]    [,15]    [,16]    [,17]
## [1,] 16.5124    18 16.65522 14.2934 15.63168    18 16.72069 16.38277 16.00072
## [2,] 17.4876    18 17.34478 15.7066 18.36832    18 17.27931 18.61723 17.99928
##         [,18]    [,19] [,20] [,21] [,22] [,23] [,24]
## [1,] 16.54389 14.53109    17    15    18    17    17
## [2,] 17.45611 23.46891    17    15    18    17    17
## 
## [[1]]$out
##  [1] 19 19 19 19 20 19 19 21 19 19 19 19 19 19 19 19 19 20 19 19 19 17 15 15 18
## [26] 18
## 
## [[1]]$group
##  [1]  1  1  1  1  1  1  1  1  1  1  1  1  1  1  3  3  3  6  7  7 10 10 11 11 12
## [26] 15
## 
## [[1]]$names
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [16] "15" "16" "18" "21" "22" "24" "26" "30" "32"
## 
## 
## [[2]]
## [[2]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]   18   18   18   15   15   16   15   15   15    15    15    15    15    15
## [2,]   18   18   18   16   16   17   16   16   16    16    15    16    16    16
## [3,]   18   18   18   17   17   17   17   17   17    16    16    16    17    16
## [4,]   18   18   19   18   18   18   18   18   17    17    17    18    17    18
## [5,]   18   18   19   19   19   19   21   20   18    18    18    20    18    18
##      [,15] [,16] [,17]
## [1,]  16.0    17    18
## [2,]  16.0    17    18
## [3,]  17.0    17    18
## [4,]  17.5    18    18
## [5,]  18.0    18    18
## 
## [[2]]$n
##  [1]  1  2  5  9 33 42 65 95 91 82 72 71 35 22 16  7  1
## 
## [[2]]$conf
##      [,1] [,2]    [,3]     [,4]     [,5]    [,6]     [,7]     [,8]     [,9]
## [1,]   18   18 17.2934 15.94667 16.44991 16.7562 16.60805 16.67579 16.83437
## [2,]   18   18 18.7066 18.05333 17.55009 17.2438 17.39195 17.32421 17.16563
##         [,10]    [,11]    [,12]    [,13]    [,14]   [,15]    [,16] [,17]
## [1,] 15.82552 15.62759 15.62498 16.73293 15.32629 16.4075 16.40282    18
## [2,] 16.17448 16.37241 16.37502 17.26707 16.67371 17.5925 17.59718    18
## 
## [[2]]$out
##  [1] 16 22 15 15 20 20 19 19 19 19 19 20 20 19 19 15
## 
## [[2]]$group
##  [1]  3  5  6  6  6  6  9  9  9  9  9  9  9 10 10 16
## 
## [[2]]$names
##  [1] "0"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17"
## [16] "18" "19"
## 
## 
## [[3]]
## [[3]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 16.0   16 16.0   15   15   15   15   15 15.0    15    15    15    15  15.0
## [2,] 17.5   17 16.5   16   16   16   16   16 16.0    16    15    16    16  16.0
## [3,] 18.0   18 17.0   17   17   17   17   16 16.5    16    16    17    17  17.5
## [4,] 18.5   18 18.0   18   18   18   18   17 17.0    17    17    17    18  18.0
## [5,] 19.0   18 18.0   18   19   20   21   18 18.0    18    18    18    18  18.0
##      [,15] [,16]
## [1,]    17    17
## [2,]    17    17
## [3,]    17    17
## [4,]    18    17
## [5,]    18    17
## 
## [[3]]$n
##  [1]   7   3   7  16  40  72  83 103  86  80  54  38  25  20  14   1
## 
## [[3]]$conf
##          [,1]     [,2]     [,3]  [,4]     [,5]     [,6]     [,7]     [,8]
## [1,] 17.40282 17.08779 16.10422 16.21 16.50036 16.62759 16.65314 15.84432
## [2,] 18.59718 18.91221 17.89578 17.79 17.49964 17.37241 17.34686 16.15568
##          [,9]    [,10]    [,11]    [,12]  [,13]   [,14]    [,15] [,16]
## [1,] 16.32962 15.82335 15.56998 16.74369 16.368 16.7934 16.57773    17
## [2,] 16.67038 16.17665 16.43002 17.25631 17.632 18.2066 17.42227    17
## 
## [[3]]$out
##  [1] 22 19 19 19 20 19 19 20 19 21 19 19 19 20 20
## 
## [[3]]$group
##  [1]  5  8  8  8  8  8  9  9  9  9 10 10 10 12 12
## 
## [[3]]$names
##  [1] "0"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18"
## [16] "19"
# Box plots of school absences grouped by the values of every other
# quantitative column.
# - The other columns are picked with a logical filter rather than
#   `-which(...)`: if "school_absence" were ever absent, `-which()` would
#   become `-integer(0)` and silently select no columns at all.
# - boxplot() returns its summary statistics; invisible() keeps the list that
#   lapply() collects from being auto-printed into the report.
# - Explicit axis labels replace the deparsed `base_datos_num$...` expressions.
invisible(lapply(
  names(base_datos_num)[names(base_datos_num) != "school_absence"],
  function(col_name) {
    boxplot(
      base_datos_num$school_absence ~ base_datos_num[[col_name]],
      main = paste("Ausencia y", col_name),
      xlab = col_name,
      ylab = "school_absence"
    )
  }
))

## [[1]]
## [[1]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    0    0    0    0    0    5  0.0   12
## [2,]    0    0    0    0    0    5  0.0   12
## [3,]    2    2    2    3    4    8 10.5   12
## [4,]    4    4    6    7    6    8 21.0   12
## [5,]   10   10   15   16   12   12 21.0   12
## 
## [[1]]$n
## [1] 112 177 179 140  32   6   2   1
## 
## [[1]]$conf
##          [,1]    [,2]     [,3]     [,4]     [,5]     [,6]     [,7] [,8]
## [1,] 1.402816 1.52496 1.291432 2.065259 2.324157 6.064903 -12.9618   12
## [2,] 2.597184 2.47504 2.708568 3.934741 5.675843 9.935097  33.9618   12
## 
## [[1]]$out
##  [1] 16 24 11 11 11 16 14 14 12 12 12 12 16 16 11 22 32 16 30 21 16 22 18 18 18
## [26] 26 16 16  0
## 
## [[1]]$group
##  [1] 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 4 4 5 5 6
## 
## [[1]]$names
## [1] "15" "16" "17" "18" "19" "20" "21" "22"
## 
## 
## [[2]]
## [[2]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    4  0.0    0    0    0    0    0    0  0.0     0     0     0     0   0.0
## [2,]    4  0.0    0    0    0    0    1    0  0.0     0     0     0     0   0.0
## [3,]    4  4.5    0    2    4    4    4    2  2.0     2     2     2     2   0.5
## [4,]    4  9.0    2    4    8    8    6    7  5.5     6     4     4     5   4.0
## [5,]    4  9.0    2    6   12   16   12   16 10.0    15    10    10    12  10.0
##      [,15] [,16] [,17]
## [1,]     0     0     0
## [2,]     0     0     0
## [3,]     0     0     0
## [4,]     2     0     0
## [5,]     4     0     0
## 
## [[2]]$n
##  [1]  1  2  5  9 33 42 65 95 91 82 72 71 35 22 16  7  1
## 
## [[2]]$conf
##      [,1]      [,2]      [,3]       [,4]     [,5]     [,6]     [,7]      [,8]
## [1,]    4 -5.555058 -1.413195 -0.1066667 1.799658 2.049606 3.020126 0.8652679
## [2,]    4 14.555058  1.413195  4.1066667 6.200342 5.950394 4.979874 3.1347321
##         [,9]     [,10]    [,11]    [,12]     [,13]      [,14] [,15] [,16] [,17]
## [1,] 1.08904 0.9531091 1.255181 1.249954 0.6646563 -0.8474285 -0.79     0     0
## [2,] 2.91096 3.0468909 2.744819 2.750046 3.3353437  1.8474285  0.79     0     0
## 
## [[2]]$out
##  [1]  8 12 26 24 22 16 14 16 14 21 18 18 16 16 16 16 22 12 21 15 18 13 32 30 14
## [26]  6 10
## 
## [[2]]$group
##  [1]  3  4  5  7  7  7  7  7  7  7  8  8  9  9  9  9  9 11 11 11 11 11 12 12 13
## [26] 15 16
## 
## [[2]]$names
##  [1] "0"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17"
## [16] "18" "19"
## 
## 
## [[3]]
## [[3]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    0    0  0.0  0.0    0    0    0  0.0    0     0     0     0     0   0.0
## [2,]    0    4  0.0  0.0    1    2    0  0.5    0     0     0     0     0   0.0
## [3,]    0    8  2.0  4.5    4    4    2  4.0    2     2     2     2     2   0.0
## [4,]    0    8  2.5 11.0    8    8    6  6.0    4     4     4     4     4   2.5
## [5,]    0    8  4.0 22.0   16   16   12 14.0    8    10     9    10    10   6.0
##      [,15] [,16]
## [1,]     0     0
## [2,]     0     0
## [3,]     0     0
## [4,]     0     0
## [5,]     0     0
## 
## [[3]]$n
##  [1]   7   3   7  16  40  72  83 103  86  80  54  38  25  20  14   1
## 
## [[3]]$conf
##      [,1]      [,2]      [,3]  [,4]    [,5]     [,6]      [,7]     [,8]
## [1,]    0  4.351146 0.5070403 0.155 2.25126 2.882771 0.9594348 3.143749
## [2,]    0 11.648854 3.4929597 8.845 5.74874 5.117229 3.0405652 4.856251
##          [,9]    [,10]    [,11]     [,12] [,13]      [,14] [,15] [,16]
## [1,] 1.318497 1.293403 1.139957 0.9747606 0.736 -0.8832469     0     0
## [2,] 2.681503 2.706597 2.860043 3.0252394 3.264  0.8832469     0     0
## 
## [[3]]$out
##  [1] 24 26 16 18 21 16 16 16 22 16 15 12 16 12 18 12 32 21 18 11 14 13 30 10 10
## [26]  2 10  4
## 
## [[3]]$group
##  [1]  5  5  7  7  7  8  8  8  8  8  9  9  9  9  9 10 10 10 10 10 11 11 12 14 14
## [26] 15 15 15
## 
## [[3]]$names
##  [1] "0"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18"
## [16] "19"
# Box plots of the first-semester grade grouped by the values of every other
# quantitative column.
# - The other columns are picked with a logical filter rather than
#   `-which(...)`: if "grade_1st_semester" were ever absent, `-which()` would
#   become `-integer(0)` and silently select no columns at all.
# - boxplot() returns its summary statistics; invisible() keeps the list that
#   lapply() collects from being auto-printed into the report.
# - Explicit axis labels replace the deparsed `base_datos_num$...` expressions.
invisible(lapply(
  names(base_datos_num)[names(base_datos_num) != "grade_1st_semester"],
  function(col_name) {
    boxplot(
      base_datos_num$grade_1st_semester ~ base_datos_num[[col_name]],
      main = paste("Nota Primer Semestre y", col_name),
      xlab = col_name,
      ylab = "grade_1st_semester"
    )
  }
))

## [[1]]
## [[1]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]    6    5    6    4  5.0  8.0    9    7
## [2,]   10   10   10    9  7.5  8.0    9    7
## [3,]   12   12   11   11  9.0 10.5    9    7
## [4,]   13   14   13   14 10.5 11.0    9    7
## [5,]   16   17   17   19 14.0 14.0    9    7
## 
## [[1]]$n
## [1] 112 177 179 140  32   6   2   1
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]     [,5]      [,6] [,7] [,8]
## [1,] 11.55211 11.52496 10.64572 10.33233 8.162078  8.564903    9    7
## [2,] 12.44789 12.47504 11.35428 11.66767 9.837922 12.435097    9    7
## 
## [[1]]$out
## [1] 18 18 18 18  0
## 
## [[1]]$group
## [1] 1 3 3 3 4
## 
## [[1]]$names
## [1] "15" "16" "17" "18" "19" "20" "21" "22"
## 
## 
## [[2]]
## [[2]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    4  9.0  5.0  7.0    6  8.0    6  7.0    5   4.0     7     9   6.0    13
## [2,]   10 10.5 10.0  8.0    9  9.0   10 10.5    9   7.0    10     9   7.5    13
## [3,]   12 12.0 11.5  9.0   11  9.5   12 14.0   10   9.0    12    10   8.5    13
## [4,]   14 13.0 14.0 10.5   13 12.5   13 14.5   12  10.5    12    12  12.0    13
## [5,]   19 16.0 17.0 11.0   17 14.0   17 15.0   15  13.0    14    12  15.0    13
##      [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,]   8.0  12.0     8  10.0     9     9     9     7    14    14
## [2,]   8.5  12.0     9  10.0     9     9     9     7    14    14
## [3,]   9.5  12.5    10  10.0    11    10     9     7    14    14
## [4,]  12.0  13.0    11  11.5    13    11     9     7    14    14
## [5,]  15.0  13.0    11  13.0    13    11     9     7    14    14
## 
## [[2]]$n
##  [1] 244  12 110   7  93  12  49   3  42   7  21   5  12   1   8   2  10   3   2
## [20]   2   1   1   1   1
## 
## [[2]]$conf
##         [,1]     [,2]     [,3]     [,4]     [,5]      [,6]     [,7]     [,8]
## [1,] 11.5954 10.85973 10.89741  7.50704 10.34465  7.903627 11.32286 10.35115
## [2,] 12.4046 13.14027 12.10259 10.49296 11.65535 11.096373 12.67714 17.64885
##           [,9]     [,10]    [,11]     [,12]    [,13] [,14]    [,15]    [,16]
## [1,]  9.268602  6.909856 11.31043  7.880208  6.44752    13  7.54485 11.38277
## [2,] 10.731398 11.090144 12.68957 12.119792 10.55248    13 11.45515 13.61723
##         [,17]    [,18]     [,19]     [,20] [,21] [,22] [,23] [,24]
## [1,]  9.00072  8.63168  6.531085  7.765543     9     7    14    14
## [2,] 10.99928 11.36832 15.468915 12.234457     9     7    14    14
## 
## [[2]]$out
## [1] 17  0 18 16 16
## 
## [[2]]$group
## [1]  4  5 11 11 11
## 
## [[2]]$names
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [16] "15" "16" "18" "21" "22" "24" "26" "30" "32"
## 
## 
## [[3]]
## [[3]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]  4.0  5.0  5.0  6.0    4    6    7    9    9    10    13    13    14  13.0
## [2,]  5.0  6.0  6.5  7.0    7    8    9   10   11    12    13    14    15  14.5
## [3,]  7.0  7.0  7.0  7.5    8    9   10   11   12    13    14    14    15  16.0
## [4,]  7.5  7.5  7.5  8.0    9   10   11   11   13    14    14    15    16  17.0
## [5,]  9.0  8.0  8.0  9.0   10   12   13   12   15    15    15    16    17  19.0
##      [,15] [,16]
## [1,]    16    18
## [2,]    17    18
## [3,]    17    18
## [4,]    18    18
## [5,]    18    18
## 
## [[3]]$n
##  [1]   7   3   7  16  40  72  83 103  86  80  54  38  25  20  14   1
## 
## [[3]]$conf
##         [,1]    [,2]     [,3]  [,4]    [,5]    [,6]      [,7]     [,8]     [,9]
## [1,] 5.50704 5.63168 6.402816 7.105 7.50036 8.62759  9.653145 10.84432 11.65925
## [2,] 8.49296 8.36832 7.597184 7.895 8.49964 9.37241 10.346855 11.15568 12.34075
##        [,10]    [,11]    [,12]  [,13]    [,14]    [,15] [,16]
## [1,] 12.6467 13.78499 13.74369 14.684 15.11675 16.57773    18
## [2,] 13.3533 14.21501 14.25631 15.316 16.88325 17.42227    18
## 
## [[3]]$out
##  [1]  0 13 13 13 13  7 14  6  8  8 16 16 12 12 11 12
## 
## [[3]]$group
##  [1]  8  8  8  8  8  8  8  8  8 10 11 11 12 12 12 13
## 
## [[3]]$names
##  [1] "0"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17" "18"
## [16] "19"
# Box plots of the second-semester grade grouped by the values of every other
# quantitative column.
# - The other columns are picked with a logical filter rather than
#   `-which(...)`: if "grade_2nd_semester" were ever absent, `-which()` would
#   become `-integer(0)` and silently select no columns at all.
# - boxplot() returns its summary statistics; invisible() keeps the list that
#   lapply() collects from being auto-printed into the report.
# - Explicit axis labels replace the deparsed `base_datos_num$...` expressions.
invisible(lapply(
  names(base_datos_num)[names(base_datos_num) != "grade_2nd_semester"],
  function(col_name) {
    boxplot(
      base_datos_num$grade_2nd_semester ~ base_datos_num[[col_name]],
      main = paste("Nota Segundo Semestre y", col_name),
      xlab = col_name,
      ylab = "grade_2nd_semester"
    )
  }
))

## [[1]]
## [[1]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,]  7.0    6    6    5  8.0  9.0   10    8
## [2,] 10.5   10   10    9  8.5 10.0   10    8
## [3,] 12.0   12   12   11 10.0 11.5   11    8
## [4,] 13.0   13   14   14 11.0 15.0   12    8
## [5,] 16.0   17   19   18 13.0 15.0   12    8
## 
## [[1]]$n
## [1] 112 177 179 140  32   6   2   1
## 
## [[1]]$conf
##          [,1]     [,2]     [,3]     [,4]      [,5]      [,6]      [,7] [,8]
## [1,] 11.62676 11.64372 11.52762 10.33233  9.301732  8.274839  8.765543    8
## [2,] 12.37324 12.35628 12.47238 11.66767 10.698268 14.725161 13.234457    8
## 
## [[1]]$out
## [1] 17  0  5  0  0  0  0  0  0
## 
## [[1]]$group
## [1] 1 2 2 3 4 4 4 5 5
## 
## [[1]]$names
## [1] "15" "16" "17" "18" "19" "20" "21" "22"
## 
## 
## [[2]]
## [[2]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]    5  9.0    6    6    6  7.0    7  9.0    5   7.0     7    11   7.0    14
## [2,]   10 10.0   10    8   10  9.5   10 11.5    9   8.0    10    11   8.5    14
## [3,]   12 11.5   12   10   11 11.0   12 14.0   11   9.0    11    11  10.0    14
## [4,]   14 13.5   13   12   13 13.5   13 15.0   13   9.5    13    11  11.5    14
## [5,]   19 16.0   17   17   16 17.0   17 16.0   15  10.0    17    11  13.0    14
##      [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24]
## [1,]   7.0   9.0   7.0  10.0  10.0     7     8     8    15    13
## [2,]   8.5   9.0   9.0  11.0  10.0     7     8     8    15    13
## [3,]  10.0  10.5  10.5  12.0  11.5     9     8     8    15    13
## [4,]  11.0  12.0  11.0  12.5  13.0    11     8     8    15    13
## [5,]  14.0  12.0  12.0  13.0  13.0    11     8     8    15    13
## 
## [[2]]$n
##  [1] 244  12 110   7  93  12  49   3  42   7  21   5  12   1   8   2  10   3   2
## [20]   2   1   1   1   1
## 
## [[2]]$conf
##         [,1]      [,2]     [,3]      [,4]     [,5]      [,6]     [,7]     [,8]
## [1,] 11.5954  9.903627 11.54806  7.611265 10.50848  9.175573 11.32286 10.80725
## [2,] 12.4046 13.096373 12.45194 12.388735 11.49152 12.824427 12.67714 17.19275
##         [,9]    [,10]     [,11] [,12]    [,13] [,14]     [,15]     [,16]
## [1,] 10.0248 8.104224  9.965647    11  8.63168    14  8.603464  7.148314
## [2,] 11.9752 9.895776 12.034353    11 11.36832    14 11.396536 13.851686
##         [,17]    [,18]     [,19]     [,20] [,21] [,22] [,23] [,24]
## [1,]  9.50072 10.63168  8.148314  4.531085     8     8    15    13
## [2,] 11.49928 13.36832 14.851686 13.468915     8     8    15    13
## 
## [[2]]$out
##  [1]  0  0  0  0  0  0  0 18 18 14 18 10 13
## 
## [[2]]$group
##  [1]  1  1  1  1  1  1  1  3  5 10 11 12 12
## 
## [[2]]$names
##  [1] "0"  "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14"
## [16] "15" "16" "18" "21" "22" "24" "26" "30" "32"
## 
## 
## [[3]]
## [[3]]$stats
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]   11    0    0    6    5    7    8    8    9     9    10    11    12    14
## [2,]   11    0    0    7    7    8    9    9   10    11    12    13    14    15
## [3,]   11    4    5    8    8    9   10   10   11    12    13    14    15    16
## [4,]   11    8    6    8    9    9   10   11   12    13    14    15    16    17
## [5,]   11    8    8    9   11   10   11   13   15    16    17    17    17    18
##      [,15] [,16] [,17]
## [1,]  16.0    18    17
## [2,]  17.0    18    17
## [3,]  17.5    18    17
## [4,]  18.0    18    17
## [5,]  18.0    18    17
## 
## [[3]]$n
##  [1]  1  2  5  9 33 42 65 95 91 82 72 71 35 22 16  7  1
## 
## [[3]]$conf
##      [,1]     [,2]      [,3]     [,4]     [,5]     [,6]      [,7]      [,8]
## [1,]   11 -4.93783 0.7604151 7.473333 7.449915 8.756201  9.804025  9.675791
## [2,]   11 12.93783 9.2395849 8.526667 8.550085 9.243799 10.195975 10.324209
##          [,9]    [,10]    [,11]    [,12]    [,13]    [,14]  [,15] [,16] [,17]
## [1,] 10.66874 11.65104 12.62759 13.62498 14.46586 15.32629 17.105    18    17
## [2,] 11.33126 12.34896 13.37241 14.37502 15.53414 16.67371 17.895    18    17
## 
## [[3]]$out
##  [1] 11  0  0  6  5 13 11  6  0  7 12 12  7  7  0 17 19
## 
## [[3]]$group
##  [1]  4  5  5  6  6  6  6  6  6  7  7  7  7  7  7 16 16
## 
## [[3]]$names
##  [1] "0"  "4"  "5"  "6"  "7"  "8"  "9"  "10" "11" "12" "13" "14" "15" "16" "17"
## [16] "18" "19"

Densidad

# Build one density plot per quantitative column; the x axis carries the
# column name.
lapply(names(base_datos_num), \(col_name) {
  datos_col <- data.frame(col = base_datos_num[[col_name]])
  ggplot(datos_col) +
    geom_density(aes(x = col)) +
    labs(x = col_name)
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

# Build one bar chart of value counts per quantitative column.
lapply(names(base_datos_num), \(col_name) {
  datos_col <- data.frame(col = base_datos_num[[col_name]])
  ggplot(datos_col, aes(x = col)) +
    geom_bar(fill = "darkred") + # stat = "count" is geom_bar()'s default
    labs(title = col_name, x = col_name, y = "")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

Variables cualitativas

# Keep only the character (string) columns of the cleaned data set.
# `select_if()` is superseded in dplyr; `select(where(...))` is the current
# equivalent and returns the same columns.
base_datos_str <- base_datos_clean %>%
  select(where(is.character))
base_datos_str
## # A tibble: 649 × 27
##    school       gender housing_type family_size parental_status mother_education
##    <chr>        <chr>  <chr>        <chr>       <chr>           <chr>           
##  1 Gabriel Per… Female Urban        Above 3     Separated       Higher Education
##  2 Gabriel Per… Female Urban        Above 3     Living Together Primary School  
##  3 Gabriel Per… Female Urban        Up to 3     Living Together Primary School  
##  4 Gabriel Per… Female Urban        Above 3     Living Together Higher Education
##  5 Gabriel Per… Female Urban        Above 3     Living Together High School     
##  6 Gabriel Per… Male   Urban        Up to 3     Living Together Higher Education
##  7 Gabriel Per… Male   Urban        Up to 3     Living Together Lower Secondary…
##  8 Gabriel Per… Female Urban        Above 3     Separated       Higher Education
##  9 Gabriel Per… Male   Urban        Up to 3     Separated       High School     
## 10 Gabriel Per… Male   Urban        Above 3     Living Together High School     
## # ℹ 639 more rows
## # ℹ 21 more variables: father_education <chr>, mother_work <chr>,
## #   father_work <chr>, reason_school_choice <chr>, legal_responsibility <chr>,
## #   commute_time <chr>, weekly_study_time <chr>,
## #   extra_educational_support <chr>, parental_educational_support <chr>,
## #   private_tutoring <chr>, extracurricular_activities <chr>,
## #   attended_daycare <chr>, desire_graduate_education <chr>, …
# Count how many rows fall in each distinct value of one column.
#
# column: name of the column to tabulate, given as a string.
# data:   data frame to count in. Defaults to the global `base_datos_str`
#         (looked up when the function is called), which is the table the
#         function previously always used, so existing calls are unchanged.
# Returns the result of count(): one row per distinct value plus a count `n`.
count_df <- function(column, data = base_datos_str) {
  # sym() + !! turn the string into a column reference for count().
  data %>% count(!!sym(column))
}

# Frequency table for every qualitative column, collected in one list and
# printed.
countt <- lapply(names(base_datos_str), count_df)

print(countt)
## [[1]]
## # A tibble: 2 × 2
##   school                   n
##   <chr>                <int>
## 1 Gabriel Pereira        423
## 2 Mousinho da Silveira   226
## 
## [[2]]
## # A tibble: 2 × 2
##   gender     n
##   <chr>  <int>
## 1 Female   383
## 2 Male     266
## 
## [[3]]
## # A tibble: 2 × 2
##   housing_type     n
##   <chr>        <int>
## 1 Rural          197
## 2 Urban          452
## 
## [[4]]
## # A tibble: 2 × 2
##   family_size     n
##   <chr>       <int>
## 1 Above 3       457
## 2 Up to 3       192
## 
## [[5]]
## # A tibble: 2 × 2
##   parental_status     n
##   <chr>           <int>
## 1 Living Together   569
## 2 Separated          80
## 
## [[6]]
## # A tibble: 5 × 2
##   mother_education           n
##   <chr>                  <int>
## 1 High School              139
## 2 Higher Education         175
## 3 Lower Secondary School   186
## 4 None                       6
## 5 Primary School           143
## 
## [[7]]
## # A tibble: 5 × 2
##   father_education           n
##   <chr>                  <int>
## 1 High School              131
## 2 Higher Education         128
## 3 Lower Secondary School   209
## 4 None                       7
## 5 Primary School           174
## 
## [[8]]
## # A tibble: 5 × 2
##   mother_work     n
##   <chr>       <int>
## 1 Health         48
## 2 Homemaker     135
## 3 Services      136
## 4 Teacher        72
## 5 other         258
## 
## [[9]]
## # A tibble: 5 × 2
##   father_work     n
##   <chr>       <int>
## 1 Health         23
## 2 Homemaker      42
## 3 Services      181
## 4 Teacher        36
## 5 other         367
## 
## [[10]]
## # A tibble: 4 × 2
##   reason_school_choice     n
##   <chr>                <int>
## 1 Course Preference      285
## 2 Near Home              149
## 3 Other                   72
## 4 Reputation             143
## 
## [[11]]
## # A tibble: 3 × 2
##   legal_responsibility     n
##   <chr>                <int>
## 1 Father                 153
## 2 Mother                 455
## 3 Other                   41
## 
## [[12]]
## # A tibble: 4 × 2
##   commute_time     n
##   <chr>        <int>
## 1 15 to 30 min   213
## 2 30 min to 1h    54
## 3 More than 1h    16
## 4 Up to 15 min   366
## 
## [[13]]
## # A tibble: 4 × 2
##   weekly_study_time     n
##   <chr>             <int>
## 1 2 to 5h             305
## 2 5 to 10h             97
## 3 More than 10h        35
## 4 Up to 2h            212
## 
## [[14]]
## # A tibble: 2 × 2
##   extra_educational_support     n
##   <chr>                     <int>
## 1 No                          581
## 2 Yes                          68
## 
## [[15]]
## # A tibble: 2 × 2
##   parental_educational_support     n
##   <chr>                        <int>
## 1 No                             251
## 2 Yes                            398
## 
## [[16]]
## # A tibble: 2 × 2
##   private_tutoring     n
##   <chr>            <int>
## 1 No                 610
## 2 Yes                 39
## 
## [[17]]
## # A tibble: 2 × 2
##   extracurricular_activities     n
##   <chr>                      <int>
## 1 No                           334
## 2 Yes                          315
## 
## [[18]]
## # A tibble: 2 × 2
##   attended_daycare     n
##   <chr>            <int>
## 1 No                 128
## 2 Yes                521
## 
## [[19]]
## # A tibble: 2 × 2
##   desire_graduate_education     n
##   <chr>                     <int>
## 1 No                           69
## 2 Yes                         580
## 
## [[20]]
## # A tibble: 2 × 2
##   has_internet     n
##   <chr>        <int>
## 1 No             151
## 2 Yes            498
## 
## [[21]]
## # A tibble: 2 × 2
##   is_dating     n
##   <chr>     <int>
## 1 No          410
## 2 Yes         239
## 
## [[22]]
## # A tibble: 3 × 2
##   good_family_relationship     n
##   <chr>                    <int>
## 1 Fair                       101
## 2 Good                       497
## 3 Poor                        51
## 
## [[23]]
## # A tibble: 3 × 2
##   free_time_after_school     n
##   <chr>                  <int>
## 1 High                     246
## 2 Low                      152
## 3 Moderate                 251
## 
## [[24]]
## # A tibble: 3 × 2
##   time_with_friends     n
##   <chr>             <int>
## 1 High                251
## 2 Low                 193
## 3 Moderate            205
## 
## [[25]]
## # A tibble: 3 × 2
##   alcohol_weekdays     n
##   <chr>            <int>
## 1 High                34
## 2 Low                572
## 3 Moderate            43
## 
## [[26]]
## # A tibble: 3 × 2
##   alcohol_weekends     n
##   <chr>            <int>
## 1 High               132
## 2 Low                397
## 3 Moderate           120
## 
## [[27]]
## # A tibble: 3 × 2
##   health_status     n
##   <chr>         <int>
## 1 Fair            124
## 2 Good            357
## 3 Poor            168

Gráficos de barra

# Build one bar chart of category counts per qualitative column.
lapply(names(base_datos_str), \(col_name) {
  datos_col <- data.frame(col = base_datos_str[[col_name]])
  ggplot(datos_col, aes(x = col)) +
    geom_bar(fill = "darkred") + # stat = "count" is geom_bar()'s default
    labs(title = col_name, x = col_name, y = "")
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

Covariaciones

Variables cualitativas y cuantitativas

# Ridgeline densities (open polygons) of the first-semester grade, one ridge
# per weekday alcohol level.
base_datos_clean |>
  ggplot(
    aes(
      x = grade_1st_semester,
      y = alcohol_weekdays,
      group = alcohol_weekdays
    )
  ) +
  geom_density_ridges()
## Picking joint bandwidth of 0.747

# Ridgeline densities (open polygons) of the first-semester grade, one ridge
# per weekend alcohol level.
base_datos_clean |>
  ggplot(
    aes(
      x = grade_1st_semester,
      y = alcohol_weekends,
      group = alcohol_weekends
    )
  ) +
  geom_density_ridges()
## Picking joint bandwidth of 0.822

# Ridgeline densities (open polygons) of the second-semester grade, one ridge
# per weekday alcohol level.
base_datos_clean |>
  ggplot(
    aes(
      x = grade_2nd_semester,
      y = alcohol_weekdays,
      group = alcohol_weekdays
    )
  ) +
  geom_density_ridges()
## Picking joint bandwidth of 0.678

# Ridgeline densities (open polygons) of the second-semester grade, one ridge
# per weekend alcohol level.
base_datos_clean |>
  ggplot(
    aes(
      x = grade_2nd_semester,
      y = alcohol_weekends,
      group = alcohol_weekends
    )
  ) +
  geom_density_ridges()
## Picking joint bandwidth of 0.856

# First-semester grade against weekday alcohol level: a box plot per level,
# then frequency polygons (bins of width 1) coloured by level.
base_datos_clean |>
  ggplot(aes(x = alcohol_weekdays, y = grade_1st_semester)) +
  geom_boxplot()

base_datos_clean |>
  ggplot(aes(x = grade_1st_semester, color = alcohol_weekdays)) +
  geom_freqpoly(binwidth = 1, linewidth = 0.75)

# Same two views of the first-semester grade for weekend alcohol level.
base_datos_clean |>
  ggplot(aes(x = alcohol_weekends, y = grade_1st_semester)) +
  geom_boxplot()

base_datos_clean |>
  ggplot(aes(x = grade_1st_semester, color = alcohol_weekends)) +
  geom_freqpoly(binwidth = 1, linewidth = 0.75)

# Second-semester grade against weekday alcohol level: a box plot per level,
# then frequency polygons (bins of width 1) coloured by level.
ggplot(base_datos_clean, aes(x = alcohol_weekdays, y = grade_2nd_semester)) +
  geom_boxplot()

ggplot(base_datos_clean, aes(x = grade_2nd_semester)) +
  geom_freqpoly(aes(color = alcohol_weekdays), binwidth = 1, linewidth = 0.75)

# Same two views of the second-semester grade for weekend alcohol level.
ggplot(base_datos_clean, aes(x = alcohol_weekends, y = grade_2nd_semester)) +
  geom_boxplot()

# This polygon previously plotted grade_1st_semester (a copy-paste slip that
# duplicated the first-semester weekend plot); it belongs to the
# second-semester series, so it now uses grade_2nd_semester.
ggplot(base_datos_clean, aes(x = grade_2nd_semester)) +
  geom_freqpoly(aes(color = alcohol_weekends), binwidth = 1, linewidth = 0.75)

Dos variables categóricas

# Count plots crossing weekday alcohol level (x) with every qualitative
# column (y); point size encodes the number of rows in each combination.
# Both aesthetics are mapped from `base_datos_str` itself (the dynamic column
# through the `.data` pronoun) instead of reaching outside the plot data with
# `base_datos_str$...` inside aes(), so the layer and its data stay in sync.
lapply(names(base_datos_str), function(col_name) {
  ggplot(base_datos_str, aes(x = alcohol_weekdays, y = .data[[col_name]])) +
    geom_count() +
    labs(title = col_name, x = "Alcohol_Weekdays", y = col_name)
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

# Count plots crossing weekend alcohol level (x) with every qualitative
# column (y); point size encodes the number of rows in each combination.
# Both aesthetics are mapped from `base_datos_str` itself (the dynamic column
# through the `.data` pronoun) instead of reaching outside the plot data with
# `base_datos_str$...` inside aes(), so the layer and its data stay in sync.
lapply(names(base_datos_str), function(col_name) {
  ggplot(base_datos_str, aes(x = alcohol_weekends, y = .data[[col_name]])) +
    geom_count() +
    labs(title = col_name, x = "Alcohol_Weekends", y = col_name)
})
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

# Number of rows for each combination of weekday alcohol level and gender.
count(base_datos_str, alcohol_weekdays, gender)
## # A tibble: 6 × 3
##   alcohol_weekdays gender     n
##   <chr>            <chr>  <int>
## 1 High             Female     9
## 2 High             Male      25
## 3 Low              Female   363
## 4 Low              Male     209
## 5 Moderate         Female    11
## 6 Moderate         Male      32
# Heat map of the weekday-alcohol-by-gender counts; tile fill encodes n.
base_datos_str |>
  count(alcohol_weekdays, gender) |>
  ggplot(aes(x = alcohol_weekdays, y = gender, fill = n)) +
  geom_tile()

# Build a heat map of counts for weekday alcohol level (x axis) against one
# other column of `base_datos_str` (y axis).
#
# col_name: name of the column to cross with alcohol_weekdays, as a string.
# Returns a ggplot object whose tile fill runs from white (low n) to blue
# (high n), with white tile borders.
create_heatmap <- function(col_name) {
  otra_col <- sym(col_name)
  conteos <- count(base_datos_str, alcohol_weekdays, !!otra_col)

  ggplot(conteos, aes(x = alcohol_weekdays, y = !!otra_col, fill = n)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(
      title = paste("Comparación de alcohol entre semana con", col_name),
      x = "Alcohol entre semana",
      y = col_name
    )
}

# Apply create_heatmap() to every column except alcohol_weekdays itself.
# A logical filter is used instead of `-which(...)`: if the column were ever
# missing, `-which()` would become `-integer(0)` and silently drop every
# column rather than none.
heatmap_plots <- lapply(
  names(base_datos_str)[names(base_datos_str) != "alcohol_weekdays"],
  create_heatmap
)

print(heatmap_plots)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

# Build a heat map of counts for weekend alcohol level (x axis) against one
# other column of `base_datos_str` (y axis). This reuses the name
# create_heatmap, so from here on the name refers to the weekend version.
#
# col_name: name of the column to cross with alcohol_weekends, as a string.
# Returns a ggplot object whose tile fill runs from white (low n) to blue
# (high n), with white tile borders.
create_heatmap <- function(col_name) {
  otra_col <- sym(col_name)
  conteos <- count(base_datos_str, alcohol_weekends, !!otra_col)

  ggplot(conteos, aes(x = alcohol_weekends, y = !!otra_col, fill = n)) +
    geom_tile(color = "white") +
    scale_fill_gradient(low = "white", high = "blue") +
    labs(
      title = paste("Comparación de alcohol en fin de semana con", col_name),
      x = "Alcohol en fin de semana",
      y = col_name
    )
}

# Apply create_heatmap() to every column except alcohol_weekends itself.
# setdiff() is used instead of `names(x)[-which(...)]`: if the column were ever
# absent, `-which(...)` would evaluate to `-integer(0)` and silently select
# no columns at all, whereas setdiff() simply returns all names.
heatmap_plots <- lapply(
  setdiff(names(base_datos_str), "alcohol_weekends"),
  create_heatmap
)

print(heatmap_plots)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

3.2 Propuesta metodológica

Tablas de contingencia Alcohol entre semana

# Contingency table of every column of base_datos_str against weekday alcohol
# consumption (rows: the column's levels; columns: alcohol_weekdays levels).
# The lambda argument stays named `col` because table() uses the argument's
# symbol as the row-dimension label in the printed output.
tablas_contingencias_1 <- lapply(
  base_datos_str,
  \(col) table(col, base_datos_str[["alcohol_weekdays"]])
)

print(tablas_contingencias_1)
## $school
##                       
## col                    High Low Moderate
##   Gabriel Pereira        22 379       22
##   Mousinho da Silveira   12 193       21
## 
## $gender
##         
## col      High Low Moderate
##   Female    9 363       11
##   Male     25 209       32
## 
## $housing_type
##        
## col     High Low Moderate
##   Rural   10 168       19
##   Urban   24 404       24
## 
## $family_size
##          
## col       High Low Moderate
##   Above 3   23 408       26
##   Up to 3   11 164       17
## 
## $parental_status
##                  
## col               High Low Moderate
##   Living Together   31 501       37
##   Separated          3  71        6
## 
## $mother_education
##                         
## col                      High Low Moderate
##   High School               8 118       13
##   Higher Education          9 155       11
##   Lower Secondary School    7 173        6
##   None                      0   5        1
##   Primary School           10 121       12
## 
## $father_education
##                         
## col                      High Low Moderate
##   High School               5 117        9
##   Higher Education          7 110       11
##   Lower Secondary School   11 188       10
##   None                      0   7        0
##   Primary School           11 150       13
## 
## $mother_work
##            
## col         High Low Moderate
##   Health       0  45        3
##   Homemaker    8 119        8
##   other       14 229       15
##   Services     9 118        9
##   Teacher      3  61        8
## 
## $father_work
##            
## col         High Low Moderate
##   Health       1  20        2
##   Homemaker    0  39        3
##   other       17 329       21
##   Services    14 150       17
##   Teacher      2  34        0
## 
## $reason_school_choice
##                    
## col                 High Low Moderate
##   Course Preference   13 258       14
##   Near Home           10 127       12
##   Other                7  56        9
##   Reputation           4 131        8
## 
## $legal_responsibility
##         
## col      High Low Moderate
##   Father    8 133       12
##   Mother   20 408       27
##   Other     6  31        4
## 
## $commute_time
##               
## col            High Low Moderate
##   15 to 30 min   11 189       13
##   30 min to 1h    4  42        8
##   More than 1h    3  12        1
##   Up to 15 min   16 329       21
## 
## $weekly_study_time
##                
## col             High Low Moderate
##   2 to 5h         14 278       13
##   5 to 10h         2  94        1
##   More than 10h    2  29        4
##   Up to 2h        16 171       25
## 
## $extra_educational_support
##      
## col   High Low Moderate
##   No    30 510       41
##   Yes    4  62        2
## 
## $parental_educational_support
##      
## col   High Low Moderate
##   No    12 215       24
##   Yes   22 357       19
## 
## $private_tutoring
##      
## col   High Low Moderate
##   No    31 539       40
##   Yes    3  33        3
## 
## $extracurricular_activities
##      
## col   High Low Moderate
##   No    14 296       24
##   Yes   20 276       19
## 
## $attended_daycare
##      
## col   High Low Moderate
##   No    10 109        9
##   Yes   24 463       34
## 
## $desire_graduate_education
##      
## col   High Low Moderate
##   No     8  55        6
##   Yes   26 517       37
## 
## $has_internet
##      
## col   High Low Moderate
##   No     5 135       11
##   Yes   29 437       32
## 
## $is_dating
##      
## col   High Low Moderate
##   No    14 364       32
##   Yes   20 208       11
## 
## $good_family_relationship
##       
## col    High Low Moderate
##   Fair    6  92        3
##   Good   24 440       33
##   Poor    4  40        7
## 
## $free_time_after_school
##           
## col        High Low Moderate
##   High       17 203       26
##   Low         7 135       10
##   Moderate   10 234        7
## 
## $time_with_friends
##           
## col        High Low Moderate
##   High       23 202       26
##   Low         4 182        7
##   Moderate    7 188       10
## 
## $alcohol_weekdays
##           
## col        High Low Moderate
##   High       34   0        0
##   Low         0 572        0
##   Moderate    0   0       43
## 
## $alcohol_weekends
##           
## col        High Low Moderate
##   High       26  74       32
##   Low         4 391        2
##   Moderate    4 107        9
## 
## $health_status
##       
## col    High Low Moderate
##   Fair    9 110        5
##   Good   19 310       28
##   Poor    6 152       10

Diagrama de mosaico

# Draw one mosaic plot per contingency table in tablas_contingencias_1.
#
# mosaicplot() places the FIRST table dimension on the x-axis and the second
# on the y-axis. Each table was built as table(col, alcohol_weekdays), so the
# crossed variable belongs on x and weekday alcohol consumption on y; the
# labels are assigned accordingly. The y label is given explicitly because
# names() of a two-way table is NULL and cannot provide one.
# The title uses the list names, which lapply() copied from base_datos_str.

lapply(seq_along(tablas_contingencias_1), function(i) {
  variable <- names(tablas_contingencias_1)[i]
  mosaicplot(tablas_contingencias_1[[i]],
             color = TRUE,
             xlab = variable,
             ylab = "Alcohol entre semana",
             main = paste("Alcohol entre Semana y", variable))
})

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## NULL
## 
## [[17]]
## NULL
## 
## [[18]]
## NULL
## 
## [[19]]
## NULL
## 
## [[20]]
## NULL
## 
## [[21]]
## NULL
## 
## [[22]]
## NULL
## 
## [[23]]
## NULL
## 
## [[24]]
## NULL
## 
## [[25]]
## NULL
## 
## [[26]]
## NULL
## 
## [[27]]
## NULL

Prueba Chi-Cuadrado

# Apply the chi-squared test of independence to each contingency table;
# the result is a list of test objects named after the crossed columns.
# NOTE(review): this call emits several "Chi-squared approximation may be
# incorrect" warnings; confirm whether the affected p-values can be used as-is.
# NOTE(review): the list includes alcohol_weekdays crossed with itself, whose
# test result is not informative.
chi_cuadrado_1 <- lapply(tablas_contingencias_1, chisq.test)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
# Print the test result (statistic, degrees of freedom, p-value) per column.
chi_cuadrado_1
## $school
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 4.0191, df = 2, p-value = 0.134
## 
## 
## $gender
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 39.436, df = 2, p-value = 2.733e-09
## 
## 
## $housing_type
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 4.1675, df = 2, p-value = 0.1245
## 
## 
## $family_size
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 2.3978, df = 2, p-value = 0.3015
## 
## 
## $parental_status
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 0.49529, df = 2, p-value = 0.7806
## 
## 
## $mother_education
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 9.3106, df = 8, p-value = 0.3168
## 
## 
## $father_education
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 4.1102, df = 8, p-value = 0.847
## 
## 
## $mother_work
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 6.1653, df = 8, p-value = 0.6287
## 
## 
## $father_work
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 10.683, df = 8, p-value = 0.2203
## 
## 
## $reason_school_choice
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 12.356, df = 6, p-value = 0.05448
## 
## 
## $legal_responsibility
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 9.6798, df = 4, p-value = 0.04618
## 
## 
## $commute_time
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 13.687, df = 6, p-value = 0.03333
## 
## 
## $weekly_study_time
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 23.815, df = 6, p-value = 0.0005648
## 
## 
## $extra_educational_support
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 1.696, df = 2, p-value = 0.4283
## 
## 
## $parental_educational_support
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 5.7747, df = 2, p-value = 0.05572
## 
## 
## $private_tutoring
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 0.60637, df = 2, p-value = 0.7385
## 
## 
## $extracurricular_activities
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 1.7848, df = 2, p-value = 0.4097
## 
## 
## $attended_daycare
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 2.2162, df = 2, p-value = 0.3302
## 
## 
## $desire_graduate_education
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 7.0739, df = 2, p-value = 0.0291
## 
## 
## $has_internet
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 1.5606, df = 2, p-value = 0.4583
## 
## 
## $is_dating
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 9.4615, df = 2, p-value = 0.00882
## 
## 
## $good_family_relationship
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 7.4854, df = 4, p-value = 0.1124
## 
## 
## $free_time_after_school
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 15.161, df = 4, p-value = 0.004379
## 
## 
## $time_with_friends
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 24.017, df = 4, p-value = 7.925e-05
## 
## 
## $alcohol_weekdays
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 1298, df = 4, p-value < 2.2e-16
## 
## 
## $alcohol_weekends
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 171.75, df = 4, p-value < 2.2e-16
## 
## 
## $health_status
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 4.2113, df = 4, p-value = 0.3782

Tablas de contingencia Alcohol fin de semana

# Contingency table of every column of base_datos_str against weekend alcohol
# consumption (rows: the column's levels; columns: alcohol_weekends levels).
# The lambda argument stays named `col` because table() uses the argument's
# symbol as the row-dimension label in the printed output.
tablas_contingencias_2 <- lapply(
  base_datos_str,
  \(col) table(col, base_datos_str[["alcohol_weekends"]])
)

print(tablas_contingencias_2)
## $school
##                       
## col                    High Low Moderate
##   Gabriel Pereira        87 259       77
##   Mousinho da Silveira   45 138       43
## 
## $gender
##         
## col      High Low Moderate
##   Female   37 275       71
##   Male     95 122       49
## 
## $housing_type
##        
## col     High Low Moderate
##   Rural   39 120       38
##   Urban   93 277       82
## 
## $family_size
##          
## col       High Low Moderate
##   Above 3   85 288       84
##   Up to 3   47 109       36
## 
## $parental_status
##                  
## col               High Low Moderate
##   Living Together  117 341      111
##   Separated         15  56        9
## 
## $mother_education
##                         
## col                      High Low Moderate
##   High School              33  75       31
##   Higher Education         35 110       30
##   Lower Secondary School   28 128       30
##   None                      2   3        1
##   Primary School           34  81       28
## 
## $father_education
##                         
## col                      High Low Moderate
##   High School              30  70       31
##   Higher Education         29  80       19
##   Lower Secondary School   39 133       37
##   None                      0   7        0
##   Primary School           34 107       33
## 
## $mother_work
##            
## col         High Low Moderate
##   Health      10  26       12
##   Homemaker   27  84       24
##   other       46 168       44
##   Services    31  75       30
##   Teacher     18  44       10
## 
## $father_work
##            
## col         High Low Moderate
##   Health       6  17        0
##   Homemaker    7  31        4
##   other       71 221       75
##   Services    45  99       37
##   Teacher      3  29        4
## 
## $reason_school_choice
##                    
## col                 High Low Moderate
##   Course Preference   59 181       45
##   Near Home           32  89       28
##   Other               19  39       14
##   Reputation          22  88       33
## 
## $legal_responsibility
##         
## col      High Low Moderate
##   Father   32  94       27
##   Mother   92 280       83
##   Other     8  23       10
## 
## $commute_time
##               
## col            High Low Moderate
##   15 to 30 min   43 130       40
##   30 min to 1h   12  31       11
##   More than 1h    7   9        0
##   Up to 15 min   70 227       69
## 
## $weekly_study_time
##                
## col             High Low Moderate
##   2 to 5h         52 193       60
##   5 to 10h         5  72       20
##   More than 10h    6  25        4
##   Up to 2h        69 107       36
## 
## $extra_educational_support
##      
## col   High Low Moderate
##   No   123 351      107
##   Yes    9  46       13
## 
## $parental_educational_support
##      
## col   High Low Moderate
##   No    64 145       42
##   Yes   68 252       78
## 
## $private_tutoring
##      
## col   High Low Moderate
##   No   120 375      115
##   Yes   12  22        5
## 
## $extracurricular_activities
##      
## col   High Low Moderate
##   No    61 210       63
##   Yes   71 187       57
## 
## $attended_daycare
##      
## col   High Low Moderate
##   No    33  71       24
##   Yes   99 326       96
## 
## $desire_graduate_education
##      
## col   High Low Moderate
##   No    20  35       14
##   Yes  112 362      106
## 
## $has_internet
##      
## col   High Low Moderate
##   No    27 101       23
##   Yes  105 296       97
## 
## $is_dating
##      
## col   High Low Moderate
##   No    87 248       75
##   Yes   45 149       45
## 
## $good_family_relationship
##       
## col    High Low Moderate
##   Fair   26  55       20
##   Good   92 314       91
##   Poor   14  28        9
## 
## $free_time_after_school
##           
## col        High Low Moderate
##   High       66 126       54
##   Low        26 106       20
##   Moderate   40 165       46
## 
## $time_with_friends
##           
## col        High Low Moderate
##   High       95 106       50
##   Low        14 154       25
##   Moderate   23 137       45
## 
## $alcohol_weekdays
##           
## col        High Low Moderate
##   High       26   4        4
##   Low        74 391      107
##   Moderate   32   2        9
## 
## $alcohol_weekends
##           
## col        High Low Moderate
##   High      132   0        0
##   Low         0 397        0
##   Moderate    0   0      120
## 
## $health_status
##       
## col    High Low Moderate
##   Fair   24  78       22
##   Good   85 207       65
##   Poor   23 112       33

Diagrama de mosaico

# Draw one mosaic plot per contingency table in tablas_contingencias_2.
#
# mosaicplot() places the FIRST table dimension on the x-axis and the second
# on the y-axis. Each table was built as table(col, alcohol_weekends), so the
# crossed variable belongs on x and weekend alcohol consumption on y; the
# labels are assigned accordingly. The y label is given explicitly because
# names() of a two-way table is NULL and cannot provide one.
# The title uses the list names, which lapply() copied from base_datos_str.

lapply(seq_along(tablas_contingencias_2), function(i) {
  variable <- names(tablas_contingencias_2)[i]
  mosaicplot(tablas_contingencias_2[[i]],
             color = TRUE,
             xlab = variable,
             ylab = "Alcohol Fin de Semana",
             main = paste("Alcohol Fin de Semana y", variable))
})

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL
## 
## [[5]]
## NULL
## 
## [[6]]
## NULL
## 
## [[7]]
## NULL
## 
## [[8]]
## NULL
## 
## [[9]]
## NULL
## 
## [[10]]
## NULL
## 
## [[11]]
## NULL
## 
## [[12]]
## NULL
## 
## [[13]]
## NULL
## 
## [[14]]
## NULL
## 
## [[15]]
## NULL
## 
## [[16]]
## NULL
## 
## [[17]]
## NULL
## 
## [[18]]
## NULL
## 
## [[19]]
## NULL
## 
## [[20]]
## NULL
## 
## [[21]]
## NULL
## 
## [[22]]
## NULL
## 
## [[23]]
## NULL
## 
## [[24]]
## NULL
## 
## [[25]]
## NULL
## 
## [[26]]
## NULL
## 
## [[27]]
## NULL

Prueba Chi-Cuadrado

# Apply the chi-squared test of independence to each contingency table;
# the result is a list of test objects named after the crossed columns.
# NOTE(review): this call emits several "Chi-squared approximation may be
# incorrect" warnings; confirm whether the affected p-values can be used as-is.
# NOTE(review): the list includes alcohol_weekends crossed with itself, whose
# test result is not informative.
chi_cuadrado_2 <- lapply(tablas_contingencias_2, chisq.test)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect

## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
# Print the test result (statistic, degrees of freedom, p-value) per column.
chi_cuadrado_2
## $school
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 0.085819, df = 2, p-value = 0.958
## 
## 
## $gender
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 69.654, df = 2, p-value = 7.495e-16
## 
## 
## $housing_type
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 0.14167, df = 2, p-value = 0.9316
## 
## 
## $family_size
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 3.171, df = 2, p-value = 0.2049
## 
## 
## $parental_status
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 3.8628, df = 2, p-value = 0.1449
## 
## 
## $mother_education
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 10.255, df = 8, p-value = 0.2476
## 
## 
## $father_education
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 9.9856, df = 8, p-value = 0.266
## 
## 
## $mother_work
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 7.0431, df = 8, p-value = 0.532
## 
## 
## $father_work
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 18.679, df = 8, p-value = 0.01668
## 
## 
## $reason_school_choice
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 6.8145, df = 6, p-value = 0.3383
## 
## 
## $legal_responsibility
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 1.0722, df = 4, p-value = 0.8987
## 
## 
## $commute_time
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 8.0027, df = 6, p-value = 0.2379
## 
## 
## $weekly_study_time
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 37.497, df = 6, p-value = 1.409e-06
## 
## 
## $extra_educational_support
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 2.4215, df = 2, p-value = 0.298
## 
## 
## $parental_educational_support
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 6.8137, df = 2, p-value = 0.03314
## 
## 
## $private_tutoring
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 3.0945, df = 2, p-value = 0.2128
## 
## 
## $extracurricular_activities
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 1.8354, df = 2, p-value = 0.3994
## 
## 
## $attended_daycare
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 3.1753, df = 2, p-value = 0.2044
## 
## 
## $desire_graduate_education
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 4.3507, df = 2, p-value = 0.1136
## 
## 
## $has_internet
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 2.7657, df = 2, p-value = 0.2509
## 
## 
## $is_dating
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 0.53281, df = 2, p-value = 0.7661
## 
## 
## $good_family_relationship
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 5.057, df = 4, p-value = 0.2815
## 
## 
## $free_time_after_school
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 18.865, df = 4, p-value = 0.0008356
## 
## 
## $time_with_friends
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 94.004, df = 4, p-value < 2.2e-16
## 
## 
## $alcohol_weekdays
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 171.75, df = 4, p-value < 2.2e-16
## 
## 
## $alcohol_weekends
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 1298, df = 4, p-value < 2.2e-16
## 
## 
## $health_status
## 
##  Pearson's Chi-squared test
## 
## data:  X[[i]]
## X-squared = 7.4814, df = 4, p-value = 0.1125